

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import sys
sys.path.append('/home/dalton_m/payload')
from basicfunctions import *

filename = 'ANpppv6'
resultsloc1 = "/home/dalton_m/ppp/"
from datetime import date
datestr = date.today().strftime(format="%Y%m%d")
resultsloc = resultsloc1 + datestr + '/'
if not os.path.exists(resultsloc):
    os.makedirs(resultsloc)
logging.basicConfig(filename=resultsloc + filename + '.txt', level=logging.ERROR,
                    format='%(asctime)s - %(levelname)s - %(message)s')
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
logging.exception('And this, too')
logging.captureWarnings(True)
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from pylab import *
import seaborn as sns
sys.path.append(resultsloc1 + '')
from rona_estimatefunctions_gpu import *

import matplotlib.font_manager as fm
mpl.rcParams['font.family']='serif'
cmfont = fm.FontProperties(fname=mpl.get_data_path() + '/fonts/ttf/cmr10.ttf')
mpl.rcParams['font.serif']=cmfont.get_name()
mpl.rcParams['mathtext.fontset']='cm'
mpl.rcParams['axes.unicode_minus']=False
sns.set_style('darkgrid', {'font.family' : 'serif',
                                   'font_serif':cmfont.get_name(),
                                   'mathtext.fontset' :'cm',
                                   'axes.unicode_minus' : False })

cutofflist = ['<50% of cutoff','50-100% of cutoff', '100-150% of cutoff','150+% of cutoff',]


'''
Occupation Distribution
'''
# def eidlinfo(dfldb, eidlt):
#     dfeidl = eidltype(eidlt)
#     ldblist = [0, 233800029.75, 347309452.0, 398000339.5, 9e9]
#     tldb = []
#     mc = ['ldb_num']
#     for i in range(len(ldblist) - 1):
#         cond = (dfldb['ldb_num'] >= ldblist[i]) & (dfldb['ldb_num'] < ldblist[i + 1])
#         condb = (dfeidl['ldb_num'] >= ldblist[i]) & (dfeidl['ldb_num'] < ldblist[i + 1])
#         savenew(sqlmerge(dfldb.loc[cond], dfeidl.loc[condb],mc), 'pppfiles/partial_ANpppv6_' + eidlt + str(i))
#     dfeidl = None
#     return()
def eidltype(eidlt):
    dfloan = openeidl(eidlt)
    dfloan['eidl'+ eidlt +'_date'] = pd.to_datetime(dfloan['ACTIONDATE'].to_pandas(), format="%Y%m%d")
    if eidlt == 'loan':
        cond = (dfloan['FACEVALUEOFDIRECTLOANORLOANGUARANTEE'] > 0).to_array()
    elif eidlt == 'grant':
        cond = (dfloan['FEDERALACTIONOBLIGATION'] > 0).to_array()
    dfloan['D_eidl_' + eidlt] = np.where(cond, 1, 0)
    kcond = (dfloan['Ddropcond'] != 1)
    dfloan = dfloan.loc[kcond]
    return(dfloan[['ldb_num', 'D_eidl_'+eidlt, 'eidl'+ eidlt +'_date']])
def oesgraphs(dfldb, dfp):
    #oes
    df = cudf.read_csv(dataloc + 'oes/oes2digit_2019.csv')
    df['ldb_num'] = pd.to_numeric(df['UDBNum'].to_pandas())
    kcols = [i for i in df if i not in ['fipsstate', 'Schnum', 'ui_acct', 'run_num',
                                        'UDBNum', 'mval', 'ldb_num']]
    mc = ['ldb_num']
    tquery = '''SELECT * FROM dfl INNER JOIN dfr ON dfl.ldb_num = dfr.ldb_num'''
    df = sqlmerge(df, dfldb[mc + ['Deligible', 'naics', 'Dfranchise', 'ein']].drop_duplicates(mc),tquery)
    #merge ppp and oes
    condoes = ((df['naics'].astype('str').str[:2] == '72') | (df['Dfranchise'] == 1))
    #condppp = ((dfp['NAICSCode'].astype('str').str[:2] == '72') | (dfp['Dfranchise'] == 1))
    tquery = '''SELECT * FROM dfl INNER JOIN dfr ON dfl.ldb_num = dfr.ldb_num'''
    df1a = sqlmerge(df.loc[condoes],dfp[mc + ['LoanAmount_ein','LoanAmount', ]],tquery)
    mc = ['ein']
    aggstats = {
        'LoanAmount': 'sum',
    }
    df1 = sqlmerge(df.loc[~condoes],dfp.groupby(mc).agg(aggstats).reset_index(),mc)
    df = cudf.concat([df1a,df1], ignore_index=True)
    mc = ['ldb_num']
    logging.exception('checking DF size, removing duplicate ldb numbers')
    logging.exception(len(df))
    logging.exception(len(df.drop_duplicates(mc)))
    logging.exception('drops ~17000 observations')
    df = df.drop_duplicates(mc)
    #dictionary for SOC codes
    temptext = open(dataloc + 'oes/2digsoctext.txt', 'r').read()
    df1 = pd.read_csv(io.StringIO(temptext), header=0, sep=";;;").set_index(keys='2digsoc')
    df1.loc[:, 'occ0title'] = df1['occ0title'].str.replace('Occupations', '').replace(',', '')
    #every occ except military
    socdict = dict(zip([str(i) for i in df1.index][:-1], [i.strip() for i in df1.occ0title][:-1]))
    #fill in 0 for missings
    df = df.fillna(0)
    #Dppp
    # dummy for ppp receipt at EIN level
    cond = ((df['LoanAmount'] > 0) | (df['LoanAmount_ein'] > 0)).to_array()
    df['Dppp'] = np.where(cond, 1, 0)

    # ###get amount and date approved
    # df['approval_date'] = pd.to_datetime(df['DateApproved_ein'].fillna(df['DateApproved']).to_pandas(), errors='coerce')
    # df['amount_final'] = df['LoanAmount_ein'].fillna(df['LoanAmount'])

    #total emp for each occ by status
    empcols = [i for i in df if i.startswith('emp_occ')]
    empcolswgt = [i+'_wgt' for i in empcols]
    for c in empcols:
        df[c + '_wgt'] = df[c] * df['FinalWgt']
    #rename columns for graphs
    renamedict = {i : socdict[i.split('_')[-1]] for i in empcols}
    renamedictwgt = {i+'_wgt' : socdict[i.split('_')[-1]] for i in empcols}
    #graph
    g2dict = {
        #'allldb' : ['All LDB'],
        'Dppp': ['PPP Recipients', (df['Dppp'] == 1)],
        'Deligible' : ['Eligible Establishments', (df['Deligible']==1)],
    }
    colors = ['#ff7f0e','#1f77b4',  '#2ca02c', '#d62728', '#9467bd', '#8c564b',]
    kvar = '2digitSOC'
    roworder = [i for i in socdict.values()]
    xlab = '2-Digit SOC'
    # graph
    x = np.arange(len(roworder))  # the label locations
    numcats = len(g2dict)
    width = 0.25  # the width of the bars
    groupwidth = width * numcats
    numcols = 1
    numrows = 1
    f, ax = plt.subplots(figsize=(8 * numcols, 7.2 * numrows), constrained_layout=True)
    # need a nudge to push x to the right - i'm not sure exactly why it won't line
    # up without this, but this works for 2 categories, might generalize
    xnudge = .6 * width
    #
    i = 0
    for k, v in g2dict.items():
        ax.barh(x - groupwidth / 2 + xnudge + i * width,
                pd.DataFrame(df[v[1]][empcols].to_pandas().sum()).T.rename(columns = renamedict)[roworder].values[0] / df[v[1]][empcols].sum().sum(),
                width, label=v[0], color = colors[i])
        i = i + 1
    #need to reorder the legend because horizontal bar chart does something dumb
    handles, labels = ax.get_legend_handles_labels()
    order = [1, 0]
    ax.legend([handles[idx] for idx in order],[labels[idx] for idx in order])
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(xlab)
    ax.set_xlabel('Proportion')
    ax.set_title('Employment Distribution Within\nSame-Colored Bars')
    ax.set_yticks(x)
    ax.set_yticklabels(roworder)
    f.tight_layout()
    plt.savefig(resultsloc + filename + '_' + kvar + '.pdf')
    # weighted
    f, ax = plt.subplots(figsize=(8 * numcols, 7.2 * numrows), constrained_layout=True)
    # need a nudge to push x to the right - i'm not sure exactly why it won't line
    # up without this, but this works for 2 categories, might generalize
    xnudge = .6 * width
    #
    i = 0
    for k, v in g2dict.items():
        ax.barh(x - groupwidth / 2 + xnudge + i * width,
                pd.DataFrame(df[v[1]][empcolswgt].to_pandas().sum()).T.rename(columns = renamedictwgt)[roworder].values[0] / df[v[1]][empcolswgt].sum().sum(),
                width, label=v[0], color = colors[i])
        i = i + 1
    handles, labels = ax.get_legend_handles_labels()
    order = [1, 0]
    ax.legend([handles[idx] for idx in order],[labels[idx] for idx in order], loc='lower right')
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(xlab)
    ax.set_xlabel('Proportion')
    ax.set_title('Employment Distribution Within\nSame-Colored Bars, OES Sample Weights')
    ax.set_yticks(x)
    ax.set_yticklabels(roworder)
    f.tight_layout()
    plt.savefig(resultsloc + filename + '_empwgt_' + kvar + '.pdf')
    return
def pppaggregate(dfp, tlevel, mc):
    aggstats = {
        'LoanAmount': 'sum',
        'DateApproved': 'min'
    }
    renamedict = {
        'LoanAmount': 'LoanAmount_' + tlevel,
        'DateApproved': 'DateApproved_' + tlevel,
    }
    condppp = ((dfp['NAICSCode'].astype('str').str[:2] == '72') | (dfp['Dfranchise'] == 1))
    dfp1 = dfp.loc[~condppp][mc + list(aggstats.keys())].to_pandas().groupby(mc).agg(aggstats).reset_index().rename(
        columns=renamedict)
    dfp = dfp.merge(cudf.DataFrame(dfp1), on=mc, how = 'left')
    # dummy for ppp receipt at UI level
    cond = (dfp['LoanAmount_'+tlevel] > 0).to_array()
    dfp['Dppp_'+tlevel] = np.where(cond, 1, 0)
    return (dfp)
def getpppdata(year):
    dfp = openppp(year).drop_duplicates('ldb_num').drop(columns = 'city')
    '''df
    drop condition
    - tailoring fuzzy matches 
    '''
    dropcond = (dfp['Ddropcond'] == 0)
    dfp = dfp[dropcond]
    einlist = cudf.read_csv(dataloc + 'pppfiles/sba_franchise_fullymerged.csv', sep='|')[
        'ein'].unique().to_pandas().tolist()
    cond = (dfp['ein'].isin(einlist)).to_array()
    dfp['Dfranchise'] = np.where(cond, 1, 0)
    dfp['week_date'] = pd.to_datetime(dfp['DateApproved'].to_pandas().astype('datetime64[W]')) + pd.DateOffset(days=2)
    # print('ui')
    # dfp = pppaggregate(dfp, 'ui', ['fipsstate', 'ui_acct'])
    print('ein')
    dfp = pppaggregate(dfp, 'ein', ['ein'])
    print('dfp')
    savenew(dfp, 'pppfiles/pppagg'+str(year))
    return

def ldbppp():
    #this opens ldb and ppp files, then merges them
    dfldb = opennew('pppfiles/ldbppp', [])
    #'BusinessName', 'employer_fuzzy'
    dfp = opennew('pppfiles/pppagg',['ldb_cosine_fuzzy_a','ldb_cosine_fuzzy_b','ldb_cosine_fuzzy_c',
                                     'ldb_cosine_fuzzy','ldb_num','NAICSCode', 'Dfranchise', 'LoanAmount','week_date', 'DateApproved', 'LoanAmount_ui', 'DateApproved_ui',
                                     'LoanAmount_ein', 'DateApproved_ein','ldb_cosine_fuzzy',
                                     'fuzzy_group', 'fuzzy_type',  'ein', 'fipsstate', 'ui_acct'])
    for t in ['a', 'b', 'c']:
        dfp['ldb_cosine_fuzzy'] = dfp['ldb_cosine_fuzzy'].fillna(dfp['ldb_cosine_fuzzy_'+t])
    print('oes')
    oesgraphs(dfldb, dfp)
    mc = ['ldb_num']
    dfp = dfp[mc + ['LoanAmount', 'week_date', 'DateApproved', 'LoanAmount_ui', 'DateApproved_ui',
                    'LoanAmount_ein', 'DateApproved_ein','ldb_cosine_fuzzy',
                    'fuzzy_group', 'fuzzy_type', ]].drop_duplicates(mc)
    return(dfldb.merge(dfp, on=mc))

def eidlcombine():
    tquery = "SELECT * FROM dfl FULL OUTER JOIN dfr ON dfl.ldb_num = dfr.ldb_num"
    dft = eidltype('loan')
    dft = dft.loc[dft['ldb_num'].notnull()]
    dft1 = eidltype('grant')
    dft1 = dft1.loc[dft1['ldb_num'].notnull()]
    dfeidl = dft.merge(dft1, on = ['ldb_num'])
    savenew(dfeidl, 'pppfiles/combinedeidl')
    return
###already run
# import zipfile
# with zipfile.ZipFile(dataloc + 'pppfiles/ppp_files.zip') as z:
#     z.extractall("/eract/daltonm/pppfiles")
#eidlcombine()


###already made this,so commenting out
# for year in [2020,2021]:
#     getpppdata(year)



'''
combine ppp and ldb data
'''
dfp = opennew('pppfiles/pppagg', ['ldb_cosine_fuzzy'
                                  'ldb_cosine_fuzzy', 'ldb_num', 'NAICSCode', 'Dfranchise', 'LoanAmount', 'week_date',
                                  'DateApproved',
                                  'LoanAmount_ein', 'DateApproved_ein',
                                  'fuzzy_group', 'fuzzy_type', 'ein', 'fipsstate', 'ui_acct'])

# cond = (dfp['LoanAmount'] > 0).to_pandas()
# dfp['Dppp'] = np.where(cond, 1, 0)
# dummy for ppp receipt at EIN level
cond = ((dfp['LoanAmount'] > 0) | (dfp['LoanAmount_ein'] > 0)).to_array()
dfp['Dppp'] = np.where(cond, 1, 0)


###get amount and date approved
dfp['approval_date'] = pd.to_datetime(dfp['DateApproved_ein'].fillna(dfp['DateApproved']).to_pandas(), errors='coerce')
dfp['amount_final'] = dfp['LoanAmount_ein'].fillna(dfp['LoanAmount'])


kcols = ['cutoff_score', 'aaemp', 'naics','naics2','naics4', 'ein', 'owner_code', 'tot_wages_yr', 'ui_acct', "run_num",] + ['hhi_cut', 'cutoff_testcase', 'yr_growth_bins', 'age_bins', 'avg_wages_bin', 'ein_aaemp', 'fipsstate', 'fipscounty',]
dfldb = ldbdata(2019,kcols , '')
cond =  (dfldb['aaemp'] > 0) & (dfldb['tot_wages_yr'] > 0) & (dfldb['owner_code'] == 5) & ~(dfldb['naics2'].isin(['92', '99']))
dfldb = dfldb.loc[cond]
###must have at least osme employment in 2020
dfq = qcew(yr = 2020, qtr = '', own = '5', fips = '', cols = ['aaemp'])
#only keep if some employemtn in 2020 from march on
dfq['totemp20'] = dfq[['emp_m'+str(m) for m in range(3,13)]].sum(axis=1)
kcond = (dfq['totemp20'] > 0)
kcols = ['ldb_num', 'totemp20']
dfq = dfq[kcond][kcols]
#only keep if 2020 match
dfldb = dfldb.merge(dfq, how = 'inner', on = ['ldb_num'])
dfq = None
#firm size class
reallabels = categoricalvariables('firmsize').values()
labels = list(categoricalvariables('firmsize').keys())
bins = [-1, 10, 50, 250, 500, 1000, 5000, 9e9]
dfldb['firm_size_class'] = pd.cut(dfldb['ein_aaemp'].to_pandas(),bins = bins, labels = labels, right=False).astype('float')
#sizeclass 2
reallabels = categoricalvariables('size2').values()
labels = list(categoricalvariables('size2').keys())
bins = [-1, 3, 6, 10, 50, 250, 500, 9e9]
dfldb['size_class2'] = pd.cut(dfldb['aaemp'].to_pandas(),bins = bins, labels = labels, right=False).astype('float')
#naics name
naicsdict = naics2c()
maskdict = categoricalvariables('naics')
maskdict = {k:v for v,k in maskdict.items()}
dfldb['naics_name'] = dfldb['naics2'].map(naicsdict).map(maskdict)
einlist = cudf.read_csv(dataloc + 'pppfiles/sba_franchise_fullymerged.csv', sep='|')[
    'ein'].unique().to_pandas().tolist()
cond = (dfldb['ein'].isin(einlist)).to_array()
dfldb['Dfranchise'] = np.where(cond, 1, 0)
condldb = ((dfldb['naics4'].str[:2] == '72') | (dfldb['Dfranchise'] == 1)).to_array()
dfldb['D_naics72_or_franchise'] = np.where(condldb, 1, 0)
dfldb['year'] = [2019] * len(dfldb)
cond= (dfldb['cutoff_score'] <= 1).to_pandas()
dfldb['Deligible'] = np.where(cond, 1, 0)
print('oes')
# oesgraphs(dfldb, dfp)
mc = ['ldb_num']
#'LoanAmount_ui', 'DateApproved_ui','week_date',
dfp1 = dfp[mc + ['LoanAmount', 'DateApproved',
                 #'ldb_cosine_fuzzy','fuzzy_group', 'fuzzy_type',
                 ]].drop_duplicates(mc)
dfldb = dfldb.merge( dfp1, on = mc, how = 'left')
dfp1 = None
mc = ['ein']
dfp = dfp[mc + [
                'LoanAmount_ein', 'DateApproved_ein',
    #'ldb_cosine_fuzzy','fuzzy_group', 'fuzzy_type',
]].drop_duplicates(mc)
dfldb = dfldb.merge( dfp, on = mc, how = 'left')
dfp = None
cond = ((dfldb['LoanAmount'] > 0) | (dfldb['LoanAmount_ein'] > 0)).to_array()
dfldb['Dppp'] = np.where(cond, 1, 0)
dfldb['approval_date'] = pd.to_datetime(dfldb['DateApproved_ein'].fillna(dfldb['DateApproved']).to_pandas(), errors='coerce')
dfldb['week_date'] = pd.to_datetime(dfldb['approval_date'].to_pandas().astype('datetime64[W]')) + pd.DateOffset(days=2)
savenew(dfldb, 'pppfiles/'+filename + '_all')
# logging.exception('# of eligible versus ppp recipiency')
# logging.exception(dfldb.groupby(['Deligible','Dppp'])['year'].count())
# dfldb['zeroemp'] = (dfldb['aaemp']==0)
# logging.exception('# of 0 employm vs ppp receipt')
# logging.exception(dfldb.groupby(['zeroemp','Dppp'])['year'].count())

'''
overall recipiency
'''
logging.exception('% of eligible estab receiving')
cond = (dfldb['Deligible'] == 1)
logging.exception(dfldb[cond]['Dppp'].mean())
cond1 = (dfldb['Dppp'] == 1)
logging.exception('% of eligible employment receiving')
logging.exception(dfldb[cond & cond1]['aaemp'].sum() / dfldb[cond ]['aaemp'].sum())

'''
Franchise or NAICS 72
'''
logging.exception('% of eligible estab receiving - if franchise or NAICS 72')
cond = (dfldb['D_naics72_or_franchise'] == 1) & (dfldb['Deligible'] == 1)
logging.exception(dfldb[cond]['Dppp'].mean())
cond1 = (dfldb['Dppp'] == 1)
logging.exception('% of eligible employment receiving')
logging.exception(dfldb[cond & cond1]['aaemp'].sum() / dfldb[cond ]['aaemp'].sum())

'''
NAICS 72
'''
logging.exception('% of eligible estab receiving - if NAICS 72')
cond = (dfldb['naics4'].str[:2] == '72') & (dfldb['Deligible'] == 1)
logging.exception(dfldb[cond]['Dppp'].mean())
cond1 = (dfldb['Dppp'] == 1)
logging.exception('% of eligible employment receiving')
logging.exception(dfldb[cond & cond1]['aaemp'].sum() / dfldb[cond ]['aaemp'].sum())

'''
cesnus pulse comparability
'''
dfldb['naics2_test'] = dfldb['naics'].astype('str').str[:2]
dfldb['naics3'] = dfldb['naics'].astype('str').str[:3]
#industry cutoffs
cond = ((dfldb['naics3'].to_pandas().isin(['482', '491', '111', '112', '521', '525', '813', '814'])) | (dfldb['naics2_test'].to_pandas().isin(['92'])) | (dfldb['naics'].to_pandas() == 110000))
dfldb['censuspulseind'] = np.where(cond, 0, 1)
logging.exception('% of eligible estab receiving - Census Pulse definition')
cond = (dfldb['aaemp'] < 500) & (dfldb['ein_aaemp'] == dfldb['aaemp']) & (dfldb['Deligible'] == 1) & (dfldb['censuspulseind'] == 1)
logging.exception(dfldb[cond]['Dppp'].mean())
cond1 = (dfldb['Dppp'] == 1)
logging.exception('% of eligible employment receiving')
logging.exception(dfldb[cond & cond1]['aaemp'].sum() / dfldb[cond ]['aaemp'].sum())

'''
add in unmatched loans
'''



'''
making graphs for 3 groups
all ldb
eligible
received
'''
graphnum = 0
gmaindict = {
    'naics_name' : ['naics', 'Sector'],
    'size_class2' : ['size2', 'Establishment Size'],
    'firm_size_class' : ['firmsize', 'Firm Size'],
    'avg_wages_bin' : ['wage', 'Avg. Wage per Employee'],
    #'age_bins' : ['age', 'Establishment Age'],
    #'yr_growth_bins' : ['growth', 'Emp. Growth from 2018 to 2019'],
    #'cutoff_testcase' : ['cutoff', 'SBA Eligibility Criteria'],
    #'hhi_cut' : ['hhi', 'Herfendahl Index - Commuting Zone and 4-Digit NAICS'],
    #'bank_cz' : [banklist, '# of Eligible Business per 1 Bank in Commuting Zone'],
    # 'bank_fips' : [banklist, '# of Eligible Business per 1 Bank in County'],
    # #'bank_zip' : [banklist, '# of Eligible Business per 1 Bank in Zip Code'],
    # 'bank_dist_bins' : [bankdistlist, 'Distance to Nearest Bank']
}
'''
cum distribution
'''
# aggstats = {
#     'ldb_num' : 'count',
# }
# #, 'bank_zip','bank_fips', 'bank_dist_bins'
# for k in ['size_class2', 'firm_size_class', 'avg_wages_bin', 'age_bins', 'yr_growth_bins', 'hhi_cut']:
#     dft = opennew('pppfiles/'+filename + '_all', [k,'week_date', 'ldb_num']).groupby([k,'week_date']).agg(aggstats).reset_index().sort_values([k,'week_date']).to_pandas()
#     dft['week_date'] = pd.to_datetime(dft['week_date'].astype('datetime64[W]'))
#     dft[k] = dft[k].map(categoricalvariables(gmaindict[k][0]))
#     dft['prop_cumsum'] = dft.groupby(k)['ldb_num'].cumsum()
#     dft['tot_loans'] = dft.groupby(k)['ldb_num'].transform('sum')
#     dft['prop_cumsum'] = dft['prop_cumsum'] / dft['tot_loans']
#     dft = dft.set_index('week_date')
#     # ylim = [-20, .4]
#     graphnum = 1
#     numcols = 1
#     numrows = 1
#     f, ax = plt.subplots(figsize=(8 * numcols, 8 * numrows), constrained_layout=True)
#     for c in list(categoricalvariables(gmaindict[k][0]).values()):
#         cond = (dft[k]==c)
#         ax.plot(dft[cond]['prop_cumsum'], label='' + c)
#         # v1.append(dft[c][-1])
#     title = "Cumulative Distribution of Loans, by " + gmaindict[k][1]
#     titlewords = title.split(" ")
#     nwords = 5
#     # nchar = 30
#     # title = '\n'.join([title[i*nchar:(i+1)*nchar] for i in range((len(title) // nchar) + 1)])
#     title = '\n'.join([' '.join(titlewords[i*nwords: (i+1)*nwords]) for i in range(math.ceil(len(titlewords)  / nwords))])
#     ax.set_title(title, fontsize=15)  # for title
#     ax.set_xlabel("Week of 2020", fontsize=10)  # label for x-axis
#     ax.tick_params(axis='x', labelrotation= 90)
#     ax.set_ylabel("Proportion of Loans", fontsize=10)  # label for y-axis
#     ax.legend(loc='best', fontsize='small')
#     # ax.set_ylim(ylim)
#     plt.savefig(resultsloc + filename + '_cumdist_'+ k + '.pdf')
'''
graphs
'''
gdict = {
    #'allldb' : ['All LDB'],
    'Deligible' : ['Eligible Establishments', 'SELECT * FROM dfldb WHERE dfldb.Deligible=1'],
    'Dppp' : ['PPP Recipients', 'SELECT * FROM dfldb WHERE dfldb.Dppp=1']
}
for kvar, tlist in gmaindict.items():
    #dft = opennew('pppfiles/'+filename + '_all', [kvar, 'ldb_num', 'Deligible', 'Dppp', 'aaemp'])
    tdict = categoricalvariables(tlist[0])
    if kvar in ['avg_wages_bin']:
        tdict = {k:'$'+v+'$' for k,v in tdict.items()}
    dfldb[kvar + 'edit'] = dfldb[kvar].map(tdict)
    #bc.create_table('dfldb',dft)
    roworder = list(reversed(tdict.values()))
    xlab = tlist[1]
    #graph
    x = np.arange(len(roworder))  # the label locations
    numcats = len(gdict)
    width = 0.25  # the width of the bars
    groupwidth = width * numcats
    graphnum = graphnum + 1
    numcols = 1
    numrows = 1
    f, ax = plt.subplots(figsize=(6 * numcols, 7.2 * numrows), constrained_layout=True)
    #need a nudge to push x to the right - i'm not sure exactly why it won't line
    # up without this, but this works for 2 categories, might generalize
    xnudge = .6 * width
    #
    i=0
    ylab = 'Proportion'
    #for k,v in gdict.items():
    ax.barh(x - groupwidth/2 + xnudge + i *width, dfldb[(dfldb['Deligible']==1)&dfldb['Dppp']==1].groupby(kvar + 'edit')['ldb_num'].count().reindex(roworder).to_pandas() / dfldb[dfldb['Deligible']==1].groupby(kvar + 'edit')['ldb_num'].count().reindex(roworder).to_pandas(),
           width, label= ylab)
    #i = i+1
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(xlab)
    ax.set_xlabel(ylab)
    ax.set_title('PPP Take-Up Rate by Eligible Establishments')
    ax.set_yticks(x)
    ax.set_yticklabels(roworder)
    #ax.legend()
    f.tight_layout()
    plt.savefig(resultsloc + filename + '_' +kvar + '.pdf')
    ymin, ymax = ax.get_xlim()
    ylim = [ymin, ymax + .05]
    #empweighted
    f, ax = plt.subplots(figsize=(6 * numcols, 7.2 * numrows), constrained_layout=True)
    #
    i=0
    ax.barh(x - groupwidth/2 + xnudge + i *width, dfldb[(dfldb['Deligible']==1)&dfldb['Dppp']==1].groupby(kvar + 'edit')['aaemp'].sum().reindex(roworder).to_pandas() / dfldb[dfldb['Deligible']==1].groupby(kvar + 'edit')['aaemp'].sum().reindex(roworder).to_pandas(),
           width, label= ylab)
    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel(xlab)
    ax.set_xlabel(ylab)
    ax.set_title('PPP Take-Up Rate by Eligible Establishments')
    ax.set_yticks(x)
    ax.set_yticklabels(roworder)
    #might need to change this if axes wildly change for employment weighted for some reason
    ax.set_xlim(ylim)
    #ax.legend()
    f.tight_layout()
    plt.savefig(resultsloc + filename + '_empwgt_' +kvar + '.pdf')
#
# '''
# bank graphs
# '''
# # dfb = opennew('pppfiles/mergedbank', ['min_dist_bank_10', 'ldb_num'])
# # dft = opennew('pppfiles/'+filename + '_all', ['ldb_num', 'Deligible', 'Dppp', 'aaemp'])
# # dft = sqlmerge(dft, dfb, ['ldb_num'])
# # bins = [-1,.5,1,5,10,9e9]
# # distbins = ['0-.5 Miles', '.5-1 miles', '1-5 miles', '5-10 miles', '10+ miles']
# # dft['bank_dist_bins'] = pd.cut(dft['min_dist_bank_10'].to_pandas(), bins=bins, labels = distbins).astype('str')
# # dft['bank_dist_bins'] = dft['bank_dist_bins'].fillna('10+ miles')
# # cond = (dft['bank_dist_bins'] == 'nan').to_array()
# # dft['bank_dist_bins'] = np.where(cond, ['10+ miles']*len(cond), dft['bank_dist_bins'].to_pandas())
# # bankdict = {
# #     'bank_dist_bins' : [distbins, 'Distance to Nearest Bank']
# # }
# # gdict = {
# #     #'allldb' : ['All LDB'],
# #     'Deligible' : ['Eligible Establishments', 'SELECT * FROM dfldb WHERE dfldb.Deligible=1'],
# #     'Dppp' : ['PPP Recipients', 'SELECT * FROM dfldb WHERE dfldb.Dppp=1']
# # }
# #
# #
# #
# #
# #
# #
# # for kvar, tlist in bankdict.items():
# #     bc.create_table('dfldb',dft)
# #     roworder = tlist[0]
# #     xlab = tlist[1]
# #     #graph
# #     x = np.arange(len(roworder))  # the label locations
# #     numcats = len(gdict)
# #     width = 0.25  # the width of the bars
# #     groupwidth = width * numcats
# #     graphnum = graphnum + 1
# #     numcols = 1
# #     numrows = 1
# #     f, ax = plt.subplots(figsize=(8 * numcols, 7.2 * numrows), constrained_layout=True)
# #     #need a nudge to push x to the right - i'm not sure exactly why it won't line
# #     # up without this, but this works for 2 categories, might generalize
# #     xnudge = .6 * width
# #     #
# #     i=0
# #     for k,v in gdict.items():
# #         ax.bar(x - groupwidth/2 + xnudge + i *width, bc.sql(v[1]).groupby(kvar)['ldb_num'].count().reindex(roworder).to_pandas() / bc.sql(v[1])['ldb_num'].count(),
# #                width, label= v[0])
# #         i = i+1
# #     # Add some text for labels, title and custom x-axis tick labels, etc.
# #     ax.set_xlabel(xlab)
# #     ax.set_ylabel('Proportion')
# #     ax.set_title('Establishment Distribution Within\nSame-Colored Bars')
# #     ax.set_xticks(x)
# #     ax.set_xticklabels(roworder, Rotation=90)
# #     ax.legend()
# #     f.tight_layout()
# #     plt.savefig(resultsloc + filename + '_' +kvar + '.pdf')
# #     ymin, ymax = ax.get_ylim()
# #     ylim = [ymin, ymax]
# #     #empweighted
# #     f, ax = plt.subplots(figsize=(8 * numcols, 7.2 * numrows), constrained_layout=True)
# #     #
# #     i=0
# #     for k,v in gdict.items():
# #         ax.bar(x - groupwidth/2 + xnudge + i *width, bc.sql(v[1]).groupby(kvar)['aaemp'].sum().reindex(roworder).to_pandas() / bc.sql(v[1])['aaemp'].sum(),
# #                width, label= v[0])
# #         i = i+1
# #     # Add some text for labels, title and custom x-axis tick labels, etc.
# #     ax.set_xlabel(xlab)
# #     ax.set_ylabel('Proportion')
# #     ax.set_title('Employment Distribution Within\nSame-Colored Bars')
# #     ax.set_xticks(x)
# #     ax.set_xticklabels(roworder, Rotation=90)
# #     #might need to change this if axes wildly change for employment weighted for some reason
# #     ax.set_ylim(ylim)
# #     ax.legend()
# #     f.tight_layout()
# #     plt.savefig(resultsloc + filename + '_empwgt_' +kvar + '.pdf')
#
#
# '''
# county map
# '''
# # dfldb = opennew('pppfiles/'+filename + '_all', ['ldb_num', 'Deligible', 'Dppp','fipsstate', 'fipscounty', 'aaemp'])
# # cond = (dfldb['Deligible'] == 1)
# # dfldb[['fipsstate', 'fipscounty']] = dfldb[['fipsstate', 'fipscounty']].fillna(-1)
# # dfldb['fips'] = dfldb[['fipsstate', 'fipscounty']].to_pandas().apply(lambda x: str(str(int(x[0])).zfill(2)) + str(str(int(x[1])).zfill(3)), axis=1)
# # dfldb['Dppp_wgt'] = dfldb['aaemp'] * dfldb['Dppp']
# # g1 = ['fips']
# # df1 = dfldb.loc[cond].groupby(g1)['Dppp'].mean().reset_index()
# # df1b = dfldb.loc[cond].groupby(g1)[['Dppp_wgt', 'aaemp']].sum().reset_index()
# # df1b['Dppp_wgt'] = df1b['Dppp_wgt'] / df1b['aaemp']
# # dftabs = [df1, df1b]
# # dfexplain = ['Percentage Takeup Rate', 'Takeup Rate, Employment Weighted',]
# # dfindexdict = {'sheet number': [i + 1 for i in range(len(dfexplain))],
# #                'description': [i for i in dfexplain] }
# # dfindex = pd.DataFrame(dfindexdict)
# # dftabs = [dfindex] + dftabs
# # writer = pd.ExcelWriter(resultsloc + filename  + "_counties.xlsx")
# #
# # excelmake(dftabs, writer)
# #
# # writer.save()
# # writer.close()
# '''
# ui and ein info
# '''
# print('eidl')
#
# # def geteidl(numcuts):
# #     dfldb = opennew('pppfiles/' + filename + '_all',  [
# #         #'BusinessName',
# #         'D_eidl_grant',
# #         'D_eidl_loan',
# #         'D_naics72_or_franchise',
# #         'DateApproved_ein',
# #         'DateApproved_ui',
# #         'Deligible',
# #         'Dppp',
# #         'Dppp_ein',
# #         'Dppp_ui',
# #         'LoanAmount',
# #         'LoanAmount_ein',
# #         'LoanAmount_ui',
# #         'cutoff_score',
# #         'cutoff_testcase',
# #         'eidlgrant_date',
# #         'eidlloan_date',
# #         #'employer_fuzzy',
# #         'fipscounty',
# #         'fipsstate',
# #         'fuzzy_group',
# #         'fuzzy_type',
# #         'ldb_cosine_fuzzy',
# #         'ldb_num',
# #         'ppp_date',
# #         'run_num',
# #         'ui_acct','DateApproved'])
# #     quants = [j/numcuts for j in range(numcuts)]
# #     ldblist = [float(j) for j in dfldb['ldb_num'].quantile(quants).values]+ [9e9]
# #     dft = opennew('pppfiles/combinedeidl', [])
# #     dft['ldb_num'] = dft['ldb_num'].fillna(dft['ldb_num0'])
# #     kcols = ['ldb_num', 'D_eidl_loan', 'eidlloan_date', 'D_eidl_grant',
# #              'eidlgrant_date']
# #     mc = ['ldb_num']
# #     dft = dft[kcols].drop_duplicates(mc)
# #     tldb = []
# #     for i in range(numcuts):
# #         condl = (dfldb['ldb_num'] >= ldblist[i]) & (dfldb['ldb_num'] < ldblist[i + 1])
# #         condr = (dft['ldb_num'] >= ldblist[i]) & (dft['ldb_num'] < ldblist[i + 1])
# #         tldb.append(sqlmerge(dfldb.loc[condl], dft.loc[condr], mc))
# #     return(tldb)
# # numcuts = 5
# # dfldb = cudf.concat(geteidl(numcuts), ignore_index=True)
# # #dummy for ppp receipt at UI level
# # cond= (dfldb['LoanAmount_ui'] > 0).to_array()
# # dfldb['Dppp_ui'] = np.where(cond, 1, 0)
# # #dummy for ppp receipt at EIN level
# # cond= (dfldb['LoanAmount_ein'] > 0).to_array()
# # dfldb['Dppp_ein'] = np.where(cond, 1, 0)
# # dfldb[['Dppp', 'Dppp_ui', 'Dppp_ein', 'D_eidl_loan', 'D_eidl_grant']] = dfldb[['Dppp', 'Dppp_ui', 'Dppp_ein', 'D_eidl_loan', 'D_eidl_grant']].fillna(0)
# # dfldb['ppp_date'] = pd.to_datetime(dfldb['DateApproved'].to_pandas())
# # #'BusinessName', 'employer_fuzzy',
# # kcols = ['ldb_num', 'fipsstate', 'ui_acct', "run_num", 'Dppp', 'Dppp_ui', 'Dppp_ein','D_eidl_loan', 'D_eidl_grant', 'LoanAmount', 'ppp_date', "LoanAmount_ui", 'LoanAmount_ein', 'DateApproved_ui', 'DateApproved_ein', 'ldb_cosine_fuzzy', 'fuzzy_group', 'fuzzy_type', 'fipscounty']
# # dfldb[kcols].to_csv(dataloc + 'qcew_ppp_eidl_030821.csv')
# # import zipfile
# # from zipfile import ZipFile
# # # writing files to a zipfile
# # with ZipFile(dataloc + 'qcew_ppp_eidl_030821.zip',mode='w', compression=zipfile.ZIP_DEFLATED) as zip:
# #     # writing each file one by one
# #     file = dataloc + 'qcew_ppp_eidl_030821.csv'
# #     zip.write(file, arcname = 'qcew_ppp_eidl_030821.csv')
# #     file = dataloc + 'readme.txt'
# #     zip.write(file, arcname = 'readme.txt')
# # kcols = ['ldb_num', 'fipsstate', 'ui_acct', "run_num", 'Dppp', 'Dppp_ui', 'Dppp_ein', 'D_eidl_loan', 'D_eidl_grant','ppp_date',
# #          'eidlloan_date', 'eidlgrant_date', 'Deligible', 'cutoff_testcase', 'D_naics72_or_franchise', "LoanAmount_ui",
# #          'LoanAmount_ein', 'DateApproved_ui', 'DateApproved_ein', 'cutoff_score']
# # savenew(dfldb[kcols], 'pppfiles/qcew_ppp_eidl_030821')
#
#
